gcqn_qsmooth <- function(counts, gcGroups, bio){
  gcBinNormCounts <- matrix(NA, nrow=nrow(counts), ncol=ncol(counts), dimnames=list(rownames(counts),colnames(counts)))
  for(ii in 1:nlevels(gcGroups)){
    id <- which(gcGroups==levels(gcGroups)[ii])
    countBin <- counts[id,]
    qs <- qsmooth(countBin, group_factor=bio)
    normCountBin <- qs@qsmoothData
    normCountBin <- round(normCountBin)
    normCountBin[normCountBin<0] <- 0
    gcBinNormCounts[id,] <- normCountBin
  }
  return(gcBinNormCounts)
}

library(grDevices)
palette(c("black", "#DF536B", "#61D04F", "#2297E6", "#28E2E5", "#D03AF5", "#EEC21F", "gray62"))
library(GenomicAlignments)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
library(rafalib)
library(cqn)
## Loading required package: mclust
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: nor1mix
## Loading required package: preprocessCore
## Loading required package: splines
## Loading required package: quantreg
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(Biobase)
library(edgeR)
## Loading required package: limma
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(mclust)
library(umap)
source("../../../methods/gcqn_validated.R")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.4
## ✔ tidyr   1.0.2     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact()    masks XVector::compact()
## ✖ dplyr::count()      masks matrixStats::count()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dplyr::last()       masks GenomicAlignments::last()
## ✖ purrr::map()        masks mclust::map()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ purrr::simplify()   masks DelayedArray::simplify()
## ✖ dplyr::slice()      masks XVector::slice(), IRanges::slice()
library(viridis)
## Loading required package: viridisLite
library(qsmooth)
library(RUVSeq)
## Loading required package: EDASeq
## Loading required package: ShortRead
## 
## Attaching package: 'ShortRead'
## The following object is masked from 'package:dplyr':
## 
##     id
## The following object is masked from 'package:purrr':
## 
##     compose
## The following object is masked from 'package:tibble':
## 
##     view
plotGCHex <- function(gr, counts){
  counts2 <- counts
  df <- as_tibble(cbind(counts2,gc=mcols(gr)$gc))
  df <- gather(df, sample, value, -gc)
  ggplot(data=df, aes(x=gc, y=log(value+1)) ) + ylab("log(count + 1)") + xlab("GC content") + geom_hex(bins = 50) + theme_bw() + facet_wrap(~sample, nrow=2)
}
plotRD <- function(rd, celltype, region, col=NULL, ...){
  if(is.null(col)) col <- 1:nlevels(region)
  plot(rd, pch=as.numeric(celltype)+15, col=col[region], 
       xlab="Dimension 1", ylab="Dimension 2", ...)
  legend("bottomleft", c("neuronal", "non-neuronal"), pch=c(16,17), bty='n')
  legend("topleft", levels(region), pch=16, col=col[1:nlevels(region)], bty='n')
}

counts <- as.matrix(read.table("../../../data/fullard2019/boca_raw_count_matrix.tsv", header=TRUE, stringsAsFactors = FALSE))
peaks <- read.table("../../../data/fullard2019/boca_peaks_consensus_no_blacklisted_regions.bed", header=FALSE, stringsAsFactors = FALSE)
colnames(peaks) <- c("chromosome", "start", "end", "name")

peakNames <- peaks$name
sn <- substr(peaks$chromosome, 4, sapply(peaks$chromosome, nchar))
start <- peaks$start
end <- peaks$end
gr <- GRanges(seqnames=sn, ranges = IRanges(start=start, end=end), strand="*")
names(gr) <- peaks$name

# get GC content
ff <- FaFile("~/data/genomes/human/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa")
peakSeqs <- getSeq(x=ff, gr)
gcContentPeaks <- letterFrequency(peakSeqs, "GC",as.prob=TRUE)[,1]
gcGroups <- Hmisc::cut2(gcContentPeaks, g=20)
mcols(gr)$gc <- gcContentPeaks

# design
# the data should consist of 2 cell types (neurons and non-neurons) across 14 distinct brain regions of 5 individuals
colnames(counts) <- gsub(x=colnames(counts),pattern="^X", replacement="")
individual <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 1)))
names(individual) <- colnames(counts)
celltype <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 2)))
names(celltype) <- colnames(counts)
region <- factor(unlist(lapply(strsplit(colnames(counts), split="_"), "[[", 3)))
names(region) <- colnames(counts)
## they also define groups of regions: https://bendlj01.u.hpc.mssm.edu/multireg/
regionBig <- as.character(region)
regionBig[region %in% c("DLPFC", "OFC", "VLPFC", "ACC", "STC", "ITC", "PMC", "INS")] <- "NCX"
regionBig[region %in% c("NAC", "PUT")] <- "STR"
regionBig <- factor(regionBig)
rawCounts <- counts

Get normalized count matrices

DESeq2

library(DESeq2)
dds <- DESeqDataSetFromMatrix(counts,
                              colData=data.frame(individual, celltype, region),
                              design=as.formula("~ individual + celltype*region"))
# calculate DESeq2 size factors
dds <- estimateSizeFactors(dds)
# extract the size factors and save them in an object
sf <- sizeFactors(dds)
# divide each row by the vector of size factors
normCountsDESeq2 <- sweep(counts, 2, sizeFactors(dds), "/")

edgeR

library(edgeR)
d <- DGEList(counts)
d <- calcNormFactors(d)
nf <- d$samples$norm.factors
effLibSize <- colSums(counts) * nf
effLibSize_scaled <- effLibSize / mean(effLibSize)
# divide each row by the vector of size factors
normCountsEdgeR <- sweep(counts, 2, effLibSize_scaled, "/")

Full quantile normalization

normCountsFQ <- FQnorm(counts, type="median")

cqn

# run cqn model
cqnObj <- cqn(counts, x=gcContentPeaks, lengths=width(gr))
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
# calculate normalized counts
normCountsCqn <- 2^(cqnObj$y + cqnObj$offset)

cqnplot(cqnObj, n=1)

cqnplot(cqnObj, n=2)

# run cqn model
cqnObj_noLength <- cqn(counts, x=gcContentPeaks, lengths=1000, lengthMethod="fixed")
## Warning: The use of 'sig2' is deprecated; do specify 'sigma' (= sqrt(sig2))
## instead
# calculate normalized counts
normCountsCqn_noLength <- 2^(cqnObj$y + cqnObj$offset)

EDASeq

library(EDASeq)
wit <- withinLaneNormalization(as.matrix(counts), gcContentPeaks, which="full")
normCountsEDASeq <- betweenLaneNormalization(wit, which="full")
rm(wit)

GC-QN

gcGroups <- Hmisc::cut2(gcContentPeaks, g=50)
normCountsGcqn <- gcqn(counts, gcGroups, "median")

Smooth GC-QN

normCountsGcqnSmooth <- gcqn_qsmooth(counts, gcGroups, bio=droplevels(interaction(celltype, region)))

PCA

set.seed(44)

pcaPlot <- function(pca, regionBig, celltype){
  require(ggplot2)
  pctVar <- pca$sdev^2 / sum(pca$sdev^2)
  df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2])
  df %>% ggplot(., aes(x=PC1, y=PC2, colour=regionBig, shape=celltype)) +
    geom_point(size=3) +
    theme_classic() + 
    xlab(paste0("PC 1 (",round(pctVar[1],3)*100,"%)")) +
    ylab(paste0("PC 2 (",round(pctVar[2],3)*100,"%)"))
}



normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Gcqn", "GcqnSmooth")
titles <- c("None", "DESeq2", "edgeR", "Full quantile", "FQ-FQ", "cqn", "GC-FQ", "smooth GC-FQ")
pcaPlots <- list()
for(mm in 1:length(normMethods)){
    curMethod <- normMethods[mm]
    if(curMethod == "Raw"){
      curCounts <- as.matrix(counts)
    } else {
      curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
    }
    pca <- prcomp(t(log1p(curCounts)))
    pctVar <- pca$sdev^2 / sum(pca$sdev^2)
    pcaPlots[[mm]] <- pcaPlot(pca, regionBig, celltype) + ggtitle(titles[mm])
}
names(pcaPlots) <- normMethods
cowplot::plot_grid(plotlist=pcaPlots)

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_pcaPlots.pdf", width=12, height=12)

Hierarchical tree

library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.13.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:Biostrings':
## 
##     nnodes
## The following object is masked from 'package:stats':
## 
##     cutree
ddRaw <- dist(log1p(t(counts)), method="euclidean")
ddEdgeR <- dist(log1p(t(normCountsEdgeR)), method="euclidean")
ddDESeq2 <- dist(log1p(t(normCountsDESeq2)), method="euclidean")
ddFQ <- dist(log1p(t(normCountsFQ)), method="euclidean")
ddCqn <- dist(log1p(t(normCountsCqn)), method="euclidean")
ddEDASeq <- dist(log1p(t(normCountsEDASeq)), method="euclidean")
ddGCQN <- dist(log1p(t(normCountsGcqn)), method="euclidean")
ddGCQNSmooth <- dist(log1p(t(normCountsGcqnSmooth)), method="euclidean")


plotDD <- function(dist, col, label, main, cex=1/2, ...){
  require(dendextend)
  hdd <- hclust(dist)
  dhdd <- as.dendrogram(hdd)
  origLabels <- labels(dhdd)
  labels(dhdd) <- label[origLabels]
  labels_colors(dhdd) <- col[origLabels]
  dhdd <- set(dhdd, "labels_cex", cex)
  plot(dhdd, main=main, ...)
}


## cell type effect
colCelltype <- as.numeric(celltype)
names(colCelltype) <- names(celltype)
# pdf("~/Dropbox/research/atacseq/bulk/plots/hclust_celltype_fullard2018.pdf",
#     width=7, height=5)
plotDD(ddRaw, col=colCelltype, label=celltype, main="Raw counts: celltype effect")

plotDD(ddEdgeR, col=colCelltype, label=celltype, main="edgeR norm: celltype effect")

plotDD(ddDESeq2, col=colCelltype, label=celltype, main="DESeq2 norm: celltype effect")

plotDD(ddFQ, col=colCelltype, label=celltype, main="FQ norm: celltype effect")

plotDD(ddCqn, col=colCelltype, label=celltype, main="Cqn norm: celltype effect")

plotDD(ddEDASeq, col=colCelltype, label=celltype, main="FQ-FQ norm: celltype effect")

plotDD(ddGCQN, col=colCelltype, label=celltype, main="GC-FQ norm: celltype effect")

plotDD(ddGCQNSmooth, col=colCelltype, label=celltype, main="smooth GC-FQ norm: celltype effect")

# dev.off()

## individual effect
colIndividual <- as.numeric(individual)
names(colIndividual) <- names(individual)
plotDD(ddRaw, col=colIndividual, label=individual, main="Raw counts: individual effect")

plotDD(ddEdgeR, col=colIndividual, label=individual, main="edgeR norm: individual effect")

plotDD(ddDESeq2, col=colIndividual, label=individual, main="DESeq2 norm: individual effect")

plotDD(ddFQ, col=colIndividual, label=individual, main="FQ norm: individual effect")

plotDD(ddCqn, col=colIndividual, label=individual, main="Cqn norm: individual effect")

plotDD(ddEDASeq, col=colIndividual, label=individual, main="EDASeq norm: individual effect")

plotDD(ddGCQN, col=colIndividual, label=individual, main="GC-QN norm: individual effect")

plotDD(ddGCQNSmooth, col=colIndividual, label=individual, main="smooth GC-QN norm: individual effect")

## region effect
colRegion <- as.numeric(region)
names(colRegion) <- names(region)
plotDD(ddRaw, col=colRegion, label=region, main="Raw counts: region effect")

plotDD(ddEdgeR, col=colRegion, label=region, main="edgeR norm: region effect")

plotDD(ddDESeq2, col=colRegion, label=region, main="DESeq2 norm: region effect")

plotDD(ddFQ, col=colRegion, label=region, main="FQ norm: region effect")

plotDD(ddCqn, col=colRegion, label=region, main="Cqn norm: region effect")

plotDD(ddEDASeq, col=colRegion, label=region, main="EDASeq norm: region effect")

plotDD(ddGCQN, col=colRegion, label=region, main="GC-QN norm: region effect")

plotDD(ddGCQNSmooth, col=colRegion, label=region, main="smooth GC-QN norm: region effect")

clustering based on PCA

library(irlba)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## The following object is masked from 'package:S4Vectors':
## 
##     expand
library(uwot)
## 
## Attaching package: 'uwot'
## The following object is masked from 'package:umap':
## 
##     umap
library(cluster)
library(mclust)
library(ggplot2)
umapDR <- function(counts, nPC=6){
  pc <- irlba::irlba(log1p(counts), nv=nPC)
  umDR <- uwot::umap(pc$v)
  return(umDR)
}

pcs <- c(2, 5, 8, 10)
ariMatCT <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatCT) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
ariMatRegion <- matrix(NA, nrow=8, ncol=length(pcs))
rownames(ariMatRegion) <- c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "EDASeq", "GC-QN", "smooth GC-QN")
for(pp in 1:length(pcs)){
  set.seed(pp)
  nPC <- pcs[pp]
  
  umRaw <- irlba::irlba(log1p(counts), nv=nPC)$v
  umEdgeR <- irlba::irlba(log1p(normCountsEdgeR), nv=nPC)$v
  umDESeq2 <- irlba::irlba(log1p(normCountsDESeq2), nv=nPC)$v
  umFQ <- irlba::irlba(log1p(normCountsFQ), nv=nPC)$v
  umCqn <- irlba::irlba(log1p(normCountsCqn), nv=nPC)$v
  umEDASeq <- irlba::irlba(log1p(normCountsEDASeq), nv=nPC)$v
  umGcqn <- irlba::irlba(log1p(normCountsGcqn), nv=nPC)$v
  umGcqnSmooth <- irlba::irlba(log1p(normCountsGcqnSmooth), nv=nPC)$v

  
  plotRD(umRaw, celltype, regionBig, main="Raw counts")
  plotRD(umEdgeR, celltype, regionBig, main="edgeR counts")
  plotRD(umDESeq2, celltype, regionBig, main="DESeq2 counts")
  plotRD(umFQ, celltype, regionBig, main="FQ counts")
  plotRD(umCqn, celltype, regionBig, main="Cqn counts")
  plotRD(umEDASeq, celltype, regionBig, main="EDASeq counts")
  plotRD(umGcqn, celltype, regionBig, main="GCQN counts")
  plotRD(umGcqnSmooth, celltype, regionBig, main="smooth GCQN counts")

  ariMatCT[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=2)$clustering, celltype)
  ariMatCT[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=2)$clustering, celltype)
  ariMatCT[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=2)$clustering, celltype)
  ariMatCT[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=2)$clustering, celltype)
  ariMatCT[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=2)$clustering, celltype)
  ariMatCT[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=2)$clustering, celltype)
  ariMatCT[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=2)$clustering, celltype)
  ariMatCT[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=2)$clustering, celltype)

  ariMatRegion[1,pp] <- mclust::adjustedRandIndex(pam(umRaw, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[2,pp] <- mclust::adjustedRandIndex(pam(umEdgeR, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[3,pp] <- mclust::adjustedRandIndex(pam(umDESeq2, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[4,pp] <- mclust::adjustedRandIndex(pam(umFQ, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[5,pp] <- mclust::adjustedRandIndex(pam(umCqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[6,pp] <- mclust::adjustedRandIndex(pam(umEDASeq, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[7,pp] <- mclust::adjustedRandIndex(pam(umGcqn, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))
  ariMatRegion[8,pp] <- mclust::adjustedRandIndex(pam(umGcqnSmooth, k=nlevels(regionBig)*nlevels(celltype))$clustering, interaction(regionBig,celltype))

}

ariPlot <- function(ariMat){
  ariMat <- as.data.frame(ariMat)
  colnames(ariMat) <- paste0(pcs,"PC")
  ariMatLong <- tidyr::gather(ariMat)
    ariMatLong$method <- factor(rep(c("Raw", "edgeR", "DESeq2", "FQ", "Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), length(pcs)))
  ariMatLong$gc <- ifelse(ariMatLong$method %in% c("Cqn", "FQ-FQ", "GC-FQ", "smooth GC-FQ"), TRUE, FALSE)
  ariMatLong$key <- factor(ariMatLong$key, levels=c("2PC", "5PC", "8PC", "10PC"))
  
  ggplot(ariMatLong, aes(x=key, y=value, color=method, shape=gc)) +
    geom_jitter(width=.1, height=0, size=3) +
    theme_classic() +
    xlab("Number of PCs") +
    ylab("Adjusted Rand Index") +
    scale_shape_discrete(name="GC-aware")
}

# for clustering according to cell type, the decrease in ARI is lower for GC-aware methods
# possibly because they don't let technical GC effects contaminate the PCs.
ariPlot(ariMatCT)

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard2018_ariCelltype.pdf", width = 8)
ariRegion <- ariPlot(ariMatRegion)
ariRegion

barplot(rowMeans(ariMatRegion), las=2)

DA analysis, neuronal vs non-neuronal

MA-plots and intersections

testEdgeR <- function(counts, design, L, tmm=FALSE, offset=NULL){
  d <- DGEList(counts)
  if(tmm) d <- calcNormFactors(d)
  if(!is.null(offset)) d$offset <- offset
  d <- estimateDisp(d, design)
  fit <- glmFit(d, design)
  lrt <- glmLRT(fit, contrast=L)
  lrt$table$padj <- p.adjust(lrt$table$PValue, "fdr")
  return(lrt)
}

covarDf <- data.frame(region, celltype, individual)
testDESeq2 <- function(counts, covarDf, L){
  dds <- DESeqDataSetFromMatrix(counts,
                              colData=covarDf,
                              design=as.formula("~ region*celltype + individual"))
  dds <- estimateSizeFactors(dds)
  dds <- DESeq(dds)
  res <- results(dds, contrast = L)
  return(res)
}

design <- model.matrix(~region*celltype + individual)
L_neur <- matrix(0, nrow=ncol(design), ncol=1)
rownames(L_neur) <- colnames(design)
L_neur[2:14,1] <- 1/14 #all regions of non-neuronal
L_neur["celltypeN",1] <- -1 #main neuronal effect
L_neur[grep(x=rownames(L_neur), pattern="^region.*celltypeN$"),1] <- -c(1/14) #all regions of neuronal

maPlotGC <- function(M, A, gc){
  require(ggplot2)
  df <- data.frame(M, A, gc)
  df %>% ggplot(., aes(x=M, y=A, colour=gc)) +
    geom_point(size=1/2, alpha=1/4) +
    scale_color_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", breaks=quantile(gc, probs=seq(0,1,length=10)), labels=NULL) +
    theme_classic() + 
    theme(legend.position = "none") +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
}

maPlotGC_hex <- function(M, A, gc, ng=20){
  require(ggplot2)
  df <- data.frame(M, A, gc)
  df %>% ggplot(., aes(x=M, y=A)) +
    stat_summary_hex(aes(z = gc), fun="mean", show.legend=FALSE, bins = 20) +
    theme_classic() +
    scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=ng, type="continuous"), name="GC content", labels=NULL) + #, limits=c(0.2,0.775)) +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
}

gcBoxplot <- function(df, title){
  ggplot(df) +
  aes(x=gc, y=logFC, color=gc) +
  geom_violin() +
  geom_boxplot(width=0.1) +
  scale_color_manual(values=wesanderson::wes_palette("Zissou1", nlevels(df$gc), "continuous")) +
  geom_abline(intercept = 0, slope = 0, col="black", lty=2) +
  theme_bw() +
  ylim(c(-2,2)) +
  xlab("GC-content bin") +
  ggtitle(title) +
  theme(axis.text.x = element_text(size=0),
        legend.position = "none",
        axis.title = element_text(size=16))
}



normMethods <- c("Raw", "DESeq2", "EdgeR", "FQ", "EDASeq", "Cqn", "Cqn_noLength", "Gcqn", "GcqnSmooth")
# lrtOut <- list()
# for(mm in 4:length(normMethods)){
#     curMethod <- normMethods[mm]
#     if(curMethod == "Raw"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
#       next
#     } else if (curMethod == "EdgeR"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, tmm=TRUE)
#       next
#     } else if (curMethod == "DESeq2"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testDESeq2(curCounts, covarDf, L_neur)
#       next
#     } else if (curMethod == "Cqn"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj$glm.offset)
#       next
#     } else if(curMethod == "Cqn_noLength"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur, offset=cqnObj_noLength$glm.offset)
#       next
#     } else if(curMethod == "RUV"){
#       curCounts <- as.matrix(rawCounts)
#       lrtOut[[mm]] <- testEdgeR(curCounts, designRUV, L_neurRUV)
#     } else if(curMethod %in% c("FQ", "EDASeq", "Gcqn", "GcqnSmooth")){
#       curCounts <- as.matrix(get(paste0("normCounts",curMethod)))
#       lrtOut[[mm]] <- testEdgeR(curCounts, design, L_neur)
#     }
# }
# saveRDS(lrtOut, file="lrtOut2.rds")
lrtOut <- readRDS("lrtOut2.rds")

titles <- c("Raw", "DESeq2", "edgeR", "FQ", "FQ-FQ", "cqn", "cqn_noLength", "GC-FQ", "smooth GC-FQ")

pList <- c()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    pList[[kk]] <- maPlotGC(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
    ggtitle(titles[kk])
  } else {
    pList[[kk]] <- maPlotGC(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
    ggtitle(titles[kk])
  }
}
cowplot::plot_grid(plotlist=pList)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.pdf", width=8, height=8)
# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots.png", width=8, height=8, dpi=150)


pListHex <- c()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    pListHex[[kk]] <- maPlotGC_hex(M=aveLogCPM(normCountsDESeq2), A=lrtOut[[kk]]$log2FoldChange, gc=gcContentPeaks) +
    ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
  } else {
  pListHex[[kk]] <- maPlotGC_hex(M=lrtOut[[kk]]$table$logCPM, A=lrtOut[[kk]]$table$logFC, gc=gcContentPeaks) +
    ggtitle(titles[kk]) + ylim(c(-5,5)) + coord_fixed()
  }
}
names(pListHex) <- titles
cowplot::plot_grid(plotlist=pListHex)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 15 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## Warning: Removed 1 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_maPlots_hex.pdf", width=8, height=8)


pListBox <- list()
gcGroups20 <- Hmisc::cut2(gcContentPeaks, g=20)
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$log2FoldChange)
    pListBox[[kk]] <- gcBoxplot(df, titles[kk])
  } else {
    df <- data.frame(gc=gcGroups20, logFC=lrtOut[[kk]]$table$logFC)
    pListBox[[kk]] <- gcBoxplot(df, titles[kk]) 
  }
}
cowplot::plot_grid(plotlist=pListBox)
## Warning: Removed 13086 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13086 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14262 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14262 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14740 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14740 rows containing non-finite values (stat_boxplot).
## Warning: Removed 14761 rows containing non-finite values (stat_ydensity).
## Warning: Removed 14761 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11456 rows containing non-finite values (stat_ydensity).
## Warning: Removed 11456 rows containing non-finite values (stat_boxplot).
## Warning: Removed 22130 rows containing non-finite values (stat_ydensity).
## Warning: Removed 22130 rows containing non-finite values (stat_boxplot).
## Warning: Removed 17948 rows containing non-finite values (stat_ydensity).
## Warning: Removed 17948 rows containing non-finite values (stat_boxplot).
## Warning: Removed 13351 rows containing non-finite values (stat_ydensity).
## Warning: Removed 13351 rows containing non-finite values (stat_boxplot).
## Warning: Removed 10943 rows containing non-finite values (stat_ydensity).
## Warning: Removed 10943 rows containing non-finite values (stat_boxplot).

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_boxplots.pdf", width=16, height=10)


library(UpSetR)
daPeaks <- list()
for(kk in 1:length(lrtOut)){
  if(kk == 2){
    daPeaks[[kk]] <- which(lrtOut[[kk]]$padj <= 0.05)
  } else {
    daPeaks[[kk]] <- which(p.adjust(lrtOut[[kk]]$table$PValue, "fdr") <= 0.05)
  }
}
barplot(unlist(lapply(daPeaks, length)),
        names=normMethods, ylab="Nr DA peaks")

names(daPeaks) <- titles 
# pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_upset.pdf")
upset(fromList(daPeaks), nsets=9, nintersects=10, keep.order=TRUE, order.by="freq")

# dev.off()

Plot for paper

M=lrtOut[[1]]$table$logCPM
A=lrtOut[[1]]$table$logFC
gc=gcContentPeaks
df <- data.frame(M, A, gc)
hlp <- df %>% ggplot(., aes(x=M, y=A)) +
    stat_summary_hex(aes(z = gc)) +
    theme_classic() +
    scale_fill_gradientn(colours=wesanderson::wes_palette("Zissou1", n=10, type="continuous"), name="GC content", labels=NULL) +
    geom_hline(aes(yintercept=0)) +
    geom_smooth(data=df, mapping=aes(x=M, y=A), se=FALSE) +
    xlab("Average log counts per million") +
    ylab("Log2 fold change")
leg <- ggpubr::get_legend(hlp)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cols <- wesanderson::wes_palette("Zissou1", n=10, type="continuous")

pcaAriPlots <- cowplot::plot_grid(pcaPlots[["EDASeq"]] + ggtitle("") + labs(colour="Region", shape="Cell type") + coord_fixed(), 
                                  ariRegion + scale_color_viridis_d(),
                                  nrow=2, ncol=1, labels=c("a", "b"))
maPlots <- cowplot::plot_grid(plotlist=pListHex[c("edgeR", "FQ", "FQ-FQ", "cqn")],
                     nrow=2, ncol=2)
## Warning: Removed 1 rows containing non-finite values (stat_summary_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_hex).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
cowplot::plot_grid(pcaAriPlots,
                   maPlots,
                   leg,
                   labels=c("", "c",""),
                   nrow=1, ncol=3,
                   rel_widths = c(1,1.2,.3))

# ggsave("~/Dropbox/research/atacseq/bulk/plots/fullard_mainFigure.pdf", width=13, height=9)

Biological interpretation

Overlap with known features

# overlap with known genomic features
# note that hg19 is synonym for GRCh37
library(ChIPpeakAnno)
## Loading required package: grid
## Loading required package: VennDiagram
## Loading required package: futile.logger
## 
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:dendextend':
## 
##     rotate
## 
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
## Loading required package: GenomicFeatures
## Loading required package: AnnotationDbi
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
library(EnsDb.Hsapiens.v75)
## Loading required package: ensembldb
## Loading required package: AnnotationFilter
## 
## Attaching package: 'ensembldb'
## The following object is masked from 'package:dplyr':
## 
##     filter
## The following object is masked from 'package:stats':
## 
##     filter
library(org.Hs.eg.db)
## 
data(TSS.human.GRCh37)
peakFeaturesAll <- suppressWarnings(assignChromosomeRegion(gr, nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesAll$percentage, las=1) # across all peaks

GO enrichment on intersection

Enriched GO terms on intersection of peaks make a lot of sense.

intersectDAPeaks <- Reduce(intersect, daPeaks)
peakFeaturesIst <- suppressWarnings(assignChromosomeRegion(gr[intersectDAPeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
# intersection is enriched in expected biological features
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
istPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[intersectDAPeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goRes <- getEnrichedGO(istPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResUniq <- lapply(goRes, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResUniq$bp)
##        go.id
## 1 GO:0010243
## 2 GO:0000122
## 3 GO:0035601
## 4 GO:0031329
## 5 GO:0070431
## 6 GO:0098732
##                                                                    go.term
## 1                                      response to organonitrogen compound
## 2                negative regulation of transcription by RNA polymerase II
## 3                                                      protein deacylation
## 4                                 regulation of cellular catabolic process
## 5 nucleotide-binding oligomerization domain containing 2 signaling pathway
## 6                                                macromolecule deacylation
##                                                                                                                                                                                                                                                                                                  Definition
## 1 Any process that results in a change in state or activity of a cell or an organism (in terms of movement, secretion, enzyme production, gene expression, etc.) as a result of an organonitrogen stimulus. An organonitrogen compound is formally a compound containing at least one carbon-nitrogen bond.
## 2                                                                                                                                                                                Any process that stops, prevents, or reduces the frequency, rate or extent of transcription mediated by RNA polymerase II.
## 3                                                                                                                                                                               The removal of an acyl group, any group or radical of the form RCO- where R is an organic group, from a protein amino acid.
## 4                                                                                                                                Any process that modulates the frequency, rate or extent of the chemical reactions and pathways resulting in the breakdown of substances, carried out by individual cells.
## 5                                                                                                                                                                   Any series of molecular signals generated as a consequence of binding to nucleotide-binding oligomerization domain containing 2 (NOD2).
## 6                                                                                                                                                                                    The removal of an acyl group, any group or radical of the form RCO- where R is an organic group, from a macromolecule.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       BP 1.326880e-05             262            998             329316
## 2       BP 1.679815e-05             224            839             329316
## 3       BP 1.897572e-05              40            103             329316
## 4       BP 2.016790e-05             231            871             329316
## 5       BP 2.179805e-05              10             13             329316
## 6       BP 2.470798e-05              40            104             329316
##   totaltermInGenome BH.adjusted.p.value
## 1           1593489          0.03653406
## 2           1593489          0.03653406
## 3           1593489          0.03653406
## 4           1593489          0.03653406
## 5           1593489          0.03653406
## 6           1593489          0.03653406
head(goResUniq$mf)
##        go.id        go.term
## 1 GO:0019899 enzyme binding
##                                                    Definition Ontology
## 1 Interacting selectively and non-covalently with any enzyme.       MF
##         pvalue count.InDataset count.InGenome totaltermInDataset
## 1 1.592692e-05             529           2217              51719
##   totaltermInGenome BH.adjusted.p.value
## 1            255581          0.04131444
head(goResUniq$cc)
##        go.id                           go.term
## 1 GO:0017053 transcriptional repressor complex
## 2 GO:0045202                           synapse
## 3 GO:0097458                       neuron part
## 4 GO:0005654                       nucleoplasm
## 5 GO:0030054                     cell junction
## 6 GO:0042995                   cell projection
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Definition
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 A protein complex that possesses activity that prevents or downregulates transcription.
## 2 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Any constituent part of a neuron, the basic cellular unit of nervous tissue. A typical neuron consists of a cell body (often called the soma), an axon, and dendrites. Their purpose is to receive, conduct, and transmit impulses in the nervous system.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           That part of the nuclear content other than the chromosomes or the nucleolus.
## 5                                                                                                                                                                                                                                                                                                                                                                              A cellular component that forms a specialized region of connection between two or more cells or between a cell and the extracellular matrix. At a cell junction, anchoring proteins extend through the plasma membrane to link cytoskeletal proteins in one cell to cytoskeletal proteins in neighboring cells or to proteins in the extracellular matrix.
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              A prolongation or process extending from a cell, e.g. a flagellum or axon.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       CC 7.715988e-08              38             84              90270
## 2       CC 3.531184e-05             284           1171              90270
## 3       CC 7.818478e-05             401           1729              90270
## 4       CC 8.998435e-05             774           3512              90270
## 5       CC 9.418562e-05             308           1298              90270
## 6       CC 2.206383e-04             491           2178              90270
##   totaltermInGenome BH.adjusted.p.value
## 1            463063        0.0001124991
## 2            463063        0.0257423287
## 3            463063        0.0274645272
## 4            463063        0.0274645272
## 5            463063        0.0274645272
## 6            463063        0.0497110353
xtable::xtable(head(goResUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:53:19 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0010243 & response to organonitrogen compound & 0.0365 \\ 
##   2 & GO:0000122 & negative regulation of transcription by RNA polymerase II & 0.0365 \\ 
##   3 & GO:0035601 & protein deacylation & 0.0365 \\ 
##   4 & GO:0031329 & regulation of cellular catabolic process & 0.0365 \\ 
##   5 & GO:0070431 & nucleotide-binding oligomerization domain containing 2 signaling pathway & 0.0365 \\ 
##   6 & GO:0098732 & macromolecule deacylation & 0.0365 \\ 
##   7 & GO:0050768 & negative regulation of neurogenesis & 0.0365 \\ 
##   8 & GO:0051961 & negative regulation of nervous system development & 0.0365 \\ 
##   9 & GO:0032990 & cell part morphogenesis & 0.0365 \\ 
##   10 & GO:0022008 & neurogenesis & 0.0398 \\ 
##   11 & GO:0007399 & nervous system development & 0.0398 \\ 
##   12 & GO:0045665 & negative regulation of neuron differentiation & 0.0398 \\ 
##   13 & GO:0120039 & plasma membrane bounded cell projection morphogenesis & 0.0398 \\ 
##   14 & GO:0006476 & protein deacetylation & 0.0398 \\ 
##   15 & GO:0048667 & cell morphogenesis involved in neuron differentiation & 0.0398 \\ 
##   16 & GO:0048709 & oligodendrocyte differentiation & 0.0398 \\ 
##   17 & GO:0048666 & neuron development & 0.0398 \\ 
##   18 & GO:0048858 & cell projection morphogenesis & 0.0459 \\ 
##   19 & GO:0007417 & central nervous system development & 0.0473 \\ 
##   20 & GO:1901698 & response to nitrogen compound & 0.0473 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:53:19 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0017053 & transcriptional repressor complex & 0.0001 \\ 
##   2 & GO:0045202 & synapse & 0.0257 \\ 
##   3 & GO:0097458 & neuron part & 0.0275 \\ 
##   4 & GO:0005654 & nucleoplasm & 0.0275 \\ 
##   5 & GO:0030054 & cell junction & 0.0275 \\ 
##   6 & GO:0042995 & cell projection & 0.0497 \\ 
##   7 & GO:0005829 & cytosol & 0.0497 \\ 
##    \hline
## \end{tabular}
## \end{table}

GO enrichment on peaks only found by cqn

cqnPeaks <- unique(c(daPeaks$cqn, daPeaks$cqn_noLength))
otherPeaks <- unique(do.call(c, daPeaks[!names(daPeaks) %in% c("cqn", "cqn_noLength")]))
cqnUniquePeaks <- cqnPeaks[!cqnPeaks %in% otherPeaks]

### interesting genomic features are depleted, re-encouraging these could be FP results
peakFeaturesCqn <- suppressWarnings(assignChromosomeRegion(gr[cqnUniquePeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

## GO enrichment recovers no gene sets.
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
cqnPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[cqnUniquePeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResCqn <- getEnrichedGO(cqnPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResCqnUniq <- lapply(goResCqn, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResCqnUniq$bp)
##  [1] go.id               go.term             Definition         
##  [4] Ontology            pvalue              count.InDataset    
##  [7] count.InGenome      totaltermInDataset  totaltermInGenome  
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResCqnUniq$mf)
##  [1] go.id               go.term             Definition         
##  [4] Ontology            pvalue              count.InDataset    
##  [7] count.InGenome      totaltermInDataset  totaltermInGenome  
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResCqnUniq$cc)
##  [1] go.id               go.term             Definition         
##  [4] Ontology            pvalue              count.InDataset    
##  [7] count.InGenome      totaltermInDataset  totaltermInGenome  
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
xtable::xtable(head(goResCqnUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:54:08 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResCqnUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:54:08 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(cqnPeaksAnnotated) / length(cqnUniquePeaks)
## [1] 0.04378117

GO enrichment on peaks only found by FQ-FQ and GC-FQ but not FQ

gcfqPeaks <- unique(c(daPeaks$`FQ-FQ`, daPeaks$`GC-FQ`))
fqPeaks <- daPeaks$FQ
gcfqUniquePeaks <- gcfqPeaks[!gcfqPeaks %in% fqPeaks]

### genomic features
peakFeaturesGcfq <- suppressWarnings(assignChromosomeRegion(gr[gcfqUniquePeaks],
                                                           nucleotideLevel=FALSE,
                                                  TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene))
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage) ; abline(h=1, lty=2)

## GO enrichment recovers no gene sets.
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
sn <- as.character(seqnames(annoData))
sn <- factor(gsub(x=sn, pattern="chr", replacement=""))
annoData <- GRanges(seqnames=sn, ranges=ranges(annoData), strand=strand(annoData),
                    mcols=mcols(annoData))
gcfqPeaksAnnotated <- suppressWarnings(annotatePeakInBatch(gr[gcfqUniquePeaks],
                                                           AnnotationData=annoData,
                                                         output="nearestBiDirectionalPromoters",
                                                         bindingRegion=c(-2000, 500)))
## Annotate peaks by annoPeaks, see ?annoPeaks for details.
## maxgap will be ignored.
goResgcfq <- getEnrichedGO(gcfqPeaksAnnotated, orgAnn="org.Hs.eg.db", maxP=.05, 
                       minGOterm=10, multiAdjMethod="BH", condense=FALSE)
goResGcfqUniq <- lapply(goResgcfq, function(tab){
  tab2 <- unique(tab[,1:10])
  tab2 <- tab2[order(tab2$pvalue, decreasing=FALSE),]
  rownames(tab2) <- NULL
  tab2
  })

head(goResGcfqUniq$bp)
##        go.id                                               go.term
## 1 GO:0006813                               potassium ion transport
## 2 GO:0032990                               cell part morphogenesis
## 3 GO:0120039 plasma membrane bounded cell projection morphogenesis
## 4 GO:0048858                         cell projection morphogenesis
## 5 GO:0048812                       neuron projection morphogenesis
## 6 GO:0043266                 regulation of potassium ion transport
##                                                                                                                                                                                                        Definition
## 1                                                             The directed movement of potassium ions (K+) into, out of or within a cell, or between cells, by means of some agent such as a transporter or pore.
## 2                                                                                                                      The process in which the anatomical structures of a cell part are generated and organized.
## 3                                                                                        The process in which the anatomical structures of a plasma membrane bounded cell projection are generated and organized.
## 4                                                                                                                The process in which the anatomical structures of a cell projection are generated and organized.
## 5                 The process in which the anatomical structures of a neuron projection are generated and organized. A neuron projection is any process extending from a neural cell, such as axons or dendrites.
## 6 Any process that modulates the frequency, rate or extent of the directed movement of potassium ions (K+) into, out of or within a cell, or between cells, by means of some agent such as a transporter or pore.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       BP 8.204788e-06              48            240             166187
## 2       BP 1.084568e-05             108            685             166187
## 3       BP 1.097839e-05             105            662             166187
## 4       BP 1.421059e-05             105            666             166187
## 5       BP 1.985465e-05             102            648             166187
## 6       BP 2.227999e-05              26            105             166187
##   totaltermInGenome BH.adjusted.p.value
## 1           1593489          0.02700587
## 2           1593489          0.02700587
## 3           1593489          0.02700587
## 4           1593489          0.02700587
## 5           1593489          0.02700587
## 6           1593489          0.02700587
head(goResGcfqUniq$mf)
##  [1] go.id               go.term             Definition         
##  [4] Ontology            pvalue              count.InDataset    
##  [7] count.InGenome      totaltermInDataset  totaltermInGenome  
## [10] BH.adjusted.p.value
## <0 rows> (or 0-length row.names)
head(goResGcfqUniq$cc)
##        go.id                     go.term
## 1 GO:0045202                     synapse
## 2 GO:0036477 somatodendritic compartment
## 3 GO:0097458                 neuron part
## 4 GO:0044456                synapse part
## 5 GO:0043005           neuron projection
## 6 GO:0030425                    dendrite
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Definition
## 1 The junction between a nerve fiber of one neuron and another neuron, muscle fiber or glial cell. As the nerve fiber approaches the synapse it enlarges into a specialized structure, the presynaptic nerve ending, which contains mitochondria and synaptic vesicles. At the tip of the nerve ending is the presynaptic membrane; facing it, and separated from it by a minute cleft (the synaptic cleft) is a specialized area of membrane on the receiving cell, known as the postsynaptic membrane. In response to the arrival of nerve impulses, the presynaptic nerve ending secretes molecules of neurotransmitters into the synaptic cleft. These diffuse across the cleft and transmit the signal to the postsynaptic membrane.
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  The region of a neuron that includes the cell body (cell soma) and dendrite(s), but excludes the axon.
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Any constituent part of a neuron, the basic cellular unit of nervous tissue. A typical neuron consists of a cell body (often called the soma), an axon, and dendrites. Their purpose is to receive, conduct, and transmit impulses in the nervous system.
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   Any constituent part of a synapse, the junction between a nerve fiber of one neuron and another neuron or muscle fiber or glial cell.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        A prolongation or process extending from a nerve cell, e.g. an axon or dendrite.
## 6                                                                                                                                                                                                                                                                                                                                                   A neuron projection that has a short, tapering, morphology. Dendrites receive and integrate signals from other neurons or from sensory stimuli, and conduct nerve impulses towards the axon or the cell body. In most neurons, the impulse is conveyed from dendrites to axon via the cell body, but in some types of unipolar neuron, the impulse does not travel via the cell body.
##   Ontology       pvalue count.InDataset count.InGenome totaltermInDataset
## 1       CC 1.022663e-07             185           1171              49927
## 2       CC 5.990199e-06             133            843              49927
## 3       CC 9.847623e-06             244           1729              49927
## 4       CC 1.095598e-05             144            938              49927
## 5       CC 2.843742e-05             189           1312              49927
## 6       CC 3.236236e-05             100            620              49927
##   totaltermInGenome BH.adjusted.p.value
## 1            463063        0.0001223105
## 2            463063        0.0032758371
## 3            463063        0.0032758371
## 4            463063        0.0032758371
## 5            463063        0.0048911883
## 6            463063        0.0048911883
xtable::xtable(head(goResGcfqUniq$bp[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:55:02 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0006813 & potassium ion transport & 0.0270 \\ 
##   2 & GO:0032990 & cell part morphogenesis & 0.0270 \\ 
##   3 & GO:0120039 & plasma membrane bounded cell projection morphogenesis & 0.0270 \\ 
##   4 & GO:0048858 & cell projection morphogenesis & 0.0270 \\ 
##   5 & GO:0048812 & neuron projection morphogenesis & 0.0270 \\ 
##   6 & GO:0043266 & regulation of potassium ion transport & 0.0270 \\ 
##   7 & GO:0098655 & cation transmembrane transport & 0.0270 \\ 
##   8 & GO:0031175 & neuron projection development & 0.0270 \\ 
##   9 & GO:0048666 & neuron development & 0.0270 \\ 
##   10 & GO:0030182 & neuron differentiation & 0.0270 \\ 
##   11 & GO:0120036 & plasma membrane bounded cell projection organization & 0.0320 \\ 
##   12 & GO:0050808 & synapse organization & 0.0320 \\ 
##   13 & GO:0099536 & synaptic signaling & 0.0320 \\ 
##   14 & GO:0099537 & trans-synaptic signaling & 0.0331 \\ 
##   15 & GO:0071804 & cellular potassium ion transport & 0.0352 \\ 
##   16 & GO:0071805 & potassium ion transmembrane transport & 0.0352 \\ 
##   17 & GO:0055085 & transmembrane transport & 0.0408 \\ 
##   18 & GO:0050804 & modulation of chemical synaptic transmission & 0.0467 \\ 
##   19 & GO:0030030 & cell projection organization & 0.0467 \\ 
##   20 & GO:0051641 & cellular localization & 0.0467 \\ 
##    \hline
## \end{tabular}
## \end{table}
xtable::xtable(head(goResGcfqUniq$cc[,c(1:2,10)], n=20), digits=4)
## % latex table generated in R 3.6.2 by xtable 1.8-4 package
## % Wed Jun  3 14:55:02 2020
## \begin{table}[ht]
## \centering
## \begin{tabular}{rllr}
##   \hline
##  & go.id & go.term & BH.adjusted.p.value \\ 
##   \hline
## 1 & GO:0045202 & synapse & 0.0001 \\ 
##   2 & GO:0036477 & somatodendritic compartment & 0.0033 \\ 
##   3 & GO:0097458 & neuron part & 0.0033 \\ 
##   4 & GO:0044456 & synapse part & 0.0033 \\ 
##   5 & GO:0043005 & neuron projection & 0.0049 \\ 
##   6 & GO:0030425 & dendrite & 0.0049 \\ 
##   7 & GO:0098978 & glutamatergic synapse & 0.0049 \\ 
##   8 & GO:0030424 & axon & 0.0049 \\ 
##   9 & GO:0097447 & dendritic tree & 0.0049 \\ 
##   10 & GO:1904813 & ficolin-1-rich granule lumen & 0.0055 \\ 
##   11 & GO:0098794 & postsynapse & 0.0087 \\ 
##   12 & GO:0033267 & axon part & 0.0087 \\ 
##   13 & GO:0120025 & plasma membrane bounded cell projection & 0.0108 \\ 
##   14 & GO:0097060 & synaptic membrane & 0.0117 \\ 
##   15 & GO:0042995 & cell projection & 0.0193 \\ 
##   16 & GO:0045211 & postsynaptic membrane & 0.0193 \\ 
##   17 & GO:1990234 & transferase complex & 0.0374 \\ 
##   18 & GO:1990752 & microtubule end & 0.0405 \\ 
##   19 & GO:0034703 & cation channel complex & 0.0428 \\ 
##   20 & GO:0000777 & condensed chromosome kinetochore & 0.0428 \\ 
##    \hline
## \end{tabular}
## \end{table}
# fraction annotated
length(gcfqPeaksAnnotated) / length(gcfqUniquePeaks)
## [1] 0.1387382

Genomic feature plots

# pdf("~/Dropbox/research/atacseq/bulk/plots/fullard_genomicFeatures.pdf", width=9)
par(mfrow=c(3,1))
barplot(peakFeaturesIst$percentage / peakFeaturesAll$percentage,
        main="Intersection across normalization methods",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesGcfq$percentage / peakFeaturesAll$percentage,
        main="GC-FQ and FQ-FQ but not FQ",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)
barplot(peakFeaturesCqn$percentage / peakFeaturesAll$percentage,
        main="Peaks only found by cqn",
        xlab="Genomic feature", ylab="Enrichment relative to all peaks") ; abline(h=1, lty=2, col="red", lwd=2)

# dev.off()